import re
import json

# 景区名称列表
scenic_spots = [
    "宝塔山",
    "大慈恩寺（大雁塔）",
    "大明宫国家遗址公园",
    "大唐芙蓉园",
    "法门寺",
    "华清宫",
    "华山风景名胜区",
    "黄帝陵",
    "黄河壶口瀑布旅游区",
    "金丝峡景区",
    "赳赳大秦",
    "秦始皇陵博物院（兵马俑）",
    "太白山国家森林公园",
    "西安碑林博物馆",
    "西安城墙",
    "延安革命纪念馆",
    "杨家岭革命旧址",
    "枣园革命旧址",
    "长安十二时辰主题街区",
    "中共中央西北局旧址",
    "黄河乾坤湾景区"
]

# 正则表达式匹配SQL语句中的字段
sql_pattern = re.compile(r"INSERT INTO `qa_qapair` \(`qa_id`, `jq_id`, `question`, `answer`, `last_operation_time`\)\s+VALUES \((\d+), '([^']+)', '([^']+)', '([^']+)', now\(\)\);")

def extract_data_from_sql(sql_file_path):
    data = []
    with open(sql_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            match = sql_pattern.search(line)
            if match:
                question = match.group(3)
                answer = match.group(4)
                scenic_spot = None
                for spot in scenic_spots:
                    if spot in question:
                        scenic_spot = spot
                        break
                if scenic_spot:
                    data.append({
                        "scenic_spot": scenic_spot,
                        "instruction": question,
                        "output": answer
                    })
    return data

def save_to_json(data, json_file_path):
    with open(json_file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)

# 使用示例
sql_file_path = '../sqls/data_insert/qa_pair.sql'
json_file_path = 'Coze-knowledge/output.json'
data = extract_data_from_sql(sql_file_path)
save_to_json(data, json_file_path)